{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:01.991368Z",
     "start_time": "2018-12-06T23:48:01.225480Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "import os, tarfile, sys\n",
    "from pathlib import Path\n",
    "from time import time\n",
    "from pprint import pprint\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "from numpy.random import choice\n",
    "import pandas as pd\n",
    "\n",
    "import spacy\n",
    "\n",
    "from gensim.models.word2vec import LineSentence\n",
    "from gensim.models.phrases import Phrases, Phraser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:03.594481Z",
     "start_time": "2018-12-06T23:48:03.586268Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "pd.set_option('float_format', '{:,.2f}'.format)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:05.233653Z",
     "start_time": "2018-12-06T23:48:05.231815Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "LANGUAGES = ['en', 'es']\n",
    "language_dict = dict(zip(LANGUAGES, ['English', 'Spanish']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:06.189629Z",
     "start_time": "2018-12-06T23:48:06.187637Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "def format_time(t):\n",
    "    m, s = divmod(t, 60)\n",
    "    h, m = divmod(m, 60)\n",
    "    return '{:02.0f}:{:02.0f}:{:02.0f}'.format(h, m, s)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Preprocess Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "### TED 2013 English & Spanish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:18.238647Z",
     "start_time": "2018-12-06T23:48:18.236727Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "SOURCE = 'TED'\n",
    "FILE_NAME = 'TED2013'\n",
    "DATA_DIR = Path('..', 'data')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "Data source: http://opus.nlpl.eu/TED2013.php"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:48:22.643258Z",
     "start_time": "2018-12-06T23:48:22.621225Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://www.ted.com/talks/stephen_palumbi_following_the_mercury_trail.html\n",
      "There's a tight and surprising link between the ocean's health and ours, says marine biologist Stephen Palumbi. He shows how toxins at the bottom of the ocean food chain find their way into our bodies, with a shocking story of toxic contamination from a Japanese fish market. His work points a way forward for saving the oceans' health -- and humanity's.\n",
      "fish,health,mission blue,oceans,science\n",
      "899\n",
      "Stephen Palumbi: Following \n"
     ]
    }
   ],
   "source": [
    "filename = DATA_DIR / 'TED' / 'TED2013.en'\n",
    "print(filename.read_text()[:500])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Tokenize & Clean Sentences\n",
    "\n",
    "Models expect data provided as a single sentence per line. We'll remove punctuation after using `spaCy`'s parser to tokenize the input text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:50:07.184530Z",
     "start_time": "2018-12-06T23:50:07.180561Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "def read_sentences(path, min_sent_length=3):\n",
    "    stats = pd.DataFrame()\n",
    "    sentences = []\n",
    "    skipped, word_count = 0, 0\n",
    "    \n",
    "    with path.open() as source:\n",
    "        for sentence in source:\n",
    "            # remove short sentences and urls (for TED data)\n",
    "            n_words = len(sentence.split())\n",
    "            if n_words < min_sent_length or sentence.startswith('http:///'):\n",
    "                skipped += 1\n",
    "            else:\n",
    "                word_count += n_words\n",
    "                sentences.append(sentence.strip())\n",
    "                \n",
    "    stats = pd.Series({'Sentences': len(sentences),\n",
    "                       '# Words': word_count,\n",
    "                       'Skipped': skipped})\n",
    "    return sentences, stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-06T23:50:09.575500Z",
     "start_time": "2018-12-06T23:50:09.571893Z"
    },
    "run_control": {
     "marked": false
    },
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "def clean_sentences(sents, nlp, path, lang):\n",
    "    exclude = ['PUNCT', 'SYM', 'X']\n",
    "    start = time()\n",
    "    vocab = Counter()\n",
    "    sents = nlp.pipe(sents)\n",
    "    d = []\n",
    "    with open(path / 'ngrams_1.txt'.format(language), 'a') as f:\n",
    "        for i, sent in enumerate(sents):\n",
    "            if i % 20000 == 0 and i > 0:\n",
    "                print(i, end=' ')\n",
    "            d.extend([[i, w.text, w.pos_] for w in sent])\n",
    "            clean_sentence = [w.text.lower() for w in sent if w.pos_ not in exclude]\n",
    "            vocab.update(clean_sentence)\n",
    "            f.write(' '.join(clean_sentence) + '\\n')\n",
    "\n",
    "    vocab = pd.Series(vocab).sort_values(ascending=False).to_frame('count')\n",
    "    with pd.HDFStore(path.parent / 'vocab.h5') as store:\n",
    "        store.put('/'.join([lang, 'vocab']), vocab)\n",
    "        store.put('/'.join([lang, 'tokens']), pd.DataFrame(d, columns=['sent_id', 'token', 'pos']))\n",
    "    duration = time() - start\n",
    "    print('\\n\\tDuration: ', format_time(duration))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:04:12.266702Z",
     "start_time": "2018-12-06T23:50:10.355827Z"
    },
    "scrolled": true,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en: 20000 40000 60000 80000 100000 120000 140000 \n",
      "\tDuration:  00:07:01\n",
      "es: 20000 40000 60000 80000 100000 120000 140000 \n",
      "\tDuration:  00:06:49\n"
     ]
    }
   ],
   "source": [
    "sentences, stats = {}, pd.DataFrame()\n",
    "\n",
    "for language in LANGUAGES:\n",
    "    source_path =  DATA_DIR / SOURCE / '{}.{}'.format(FILE_NAME, language)\n",
    "    sentences[language], stats[language_dict[language]] = read_sentences(source_path)\n",
    "    \n",
    "    print(language, end=': ')\n",
    "    target_path = Path('vocab', SOURCE, language)\n",
    "    if not target_path.exists():\n",
    "        target_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    clean_sentences(sentences[language], spacy.load(language), target_path, language)    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Corpus Summary Stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:07:34.749099Z",
     "start_time": "2018-12-07T00:07:34.741418Z"
    },
    "scrolled": true,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>English</th>\n",
       "      <th>Spanish</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th># Words</th>\n",
       "      <td>2,640,928</td>\n",
       "      <td>2,548,942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sentences</th>\n",
       "      <td>152,729</td>\n",
       "      <td>151,850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Skipped</th>\n",
       "      <td>5,166</td>\n",
       "      <td>6,045</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             English    Spanish\n",
       "# Words    2,640,928  2,548,942\n",
       "Sentences    152,729    151,850\n",
       "Skipped        5,166      6,045"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stats.applymap(lambda x: '{:,d}'.format(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:07:37.530218Z",
     "start_time": "2018-12-07T00:07:37.523597Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "with pd.HDFStore(Path('vocab', SOURCE, 'vocab.h5')) as store:\n",
    "    store.put('stats', stats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Inspect Result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:07:40.045413Z",
     "start_time": "2018-12-07T00:07:40.040404Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"There's a tight and surprising link between the ocean's health and ours, says marine biologist Stephen Palumbi. He shows how toxins at the bottom of the ocean food chain find their way into our bodies, with a shocking story of toxic contamination from a Japanese fish market. His work points a way forward for saving the oceans' health -- and humanity's.\",\n",
       " 'Stephen Palumbi: Following the mercury trail',\n",
       " 'It can be a very complicated thing, the ocean.']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentences['en'][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:07:40.972554Z",
     "start_time": "2018-12-07T00:07:40.970041Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Existe una estrecha y sorprendente relación entre nuestra salud y la salud del océano, dice el biologo marino Stephen Palumbi. Nos muestra, através de una impactante historia acerca de la contaminación tóxica en el mercado pesquero japonés, como las toxinas de la cadena alimenticia del fondo oceánico llegan a nuestro cuerpo.',\n",
       " 'Stephen Palumbi: Siguiendo el camino del mercurio.',\n",
       " 'El océano puede ser una cosa muy complicada.']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentences['es'][:3] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Create n-grams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:07:42.030674Z",
     "start_time": "2018-12-07T00:07:42.026289Z"
    },
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "def create_ngrams(language, max_length=3):\n",
    "    \"\"\"Using gensim to create ngrams\"\"\"\n",
    "    \n",
    "    path = Path('vocab', SOURCE, language)\n",
    "    n_grams = pd.DataFrame()\n",
    "    start = time()\n",
    "    for n in range(2, max_length + 1):\n",
    "        print(n, end=' ')\n",
    "        \n",
    "        sentences = LineSentence(str(path / 'ngrams_{}.txt'.format(n-1)))\n",
    "        phrases = Phrases(sentences, threshold=100, min_count=10)\n",
    "\n",
    "        s = pd.Series({k.decode('utf-8'): v for k,\n",
    "                       v in phrases.export_phrases(sentences)}) \n",
    "        s = s.to_frame('score').reset_index().rename(\n",
    "            columns={'index': 'phrase'}).assign(length=n)\n",
    "        \n",
    "        n_grams = pd.concat([n_grams, s])\n",
    "        grams = Phraser(phrases)\n",
    "        sentences = grams[sentences]\n",
    "        \n",
    "        with open(path / 'ngrams_{}.txt'.format(n), 'w') as f:\n",
    "            for sentence in sentences:\n",
    "                f.write(' '.join(sentence) + '\\n')\n",
    "                \n",
    "    n_grams = n_grams.sort_values('score', ascending=False)\n",
    "    n_grams.phrase = n_grams.phrase.str.replace('_', ' ')\n",
    "    n_grams['ngram'] = n_grams.phrase.str.replace(' ', '_')\n",
    "    \n",
    "    with pd.HDFStore(Path(path.parent / 'vocab.h5')) as store:\n",
    "        store.put('/'.join([language, 'ngrams']), n_grams)\n",
    "        \n",
    "    print('\\n\\tDuration: ', format_time(time() - start))\n",
    "    print('\\tngrams: {:,d}\\n'.format(len(n_grams)))\n",
    "    print(n_grams.groupby('length').size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-07T00:10:40.901962Z",
     "start_time": "2018-12-07T00:07:45.209467Z"
    },
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " en 2 3 \n",
      "\tDuration:  00:00:36\n",
      "\tngrams: 483\n",
      "\n",
      "length\n",
      "2    433\n",
      "3     50\n",
      "dtype: int64\n",
      "\n",
      " es 2 3 \n",
      "\tDuration:  00:00:37\n",
      "\tngrams: 508\n",
      "\n",
      "length\n",
      "2    462\n",
      "3     46\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "for language in LANGUAGES:\n",
    "    print('\\n', language, end=' ')\n",
    "    create_ngrams(language)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "47px",
    "left": "1227px",
    "top": "40px",
    "width": "212px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
